# global chunk options: echo the code, suppress messages and warnings
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)

# attach packages
library(tidyverse)
library(here)
library(tidytext)
library(textdata)
library(pdftools)
library(ggwordcloud)
library(kableExtra)

Get the lyrics for Act 1 and Act 2

# read the text of each act's PDF from the project's data/ folder
ham_act1 <- here("data", "hamilton_act1.pdf") %>%
  pdf_text()

ham_act2 <- here("data", "hamilton_act2.pdf") %>%
  pdf_text()

Convert text to a dataframe

# one row per lyric line for each act:
#   - page: index of the element of the pdf_text() result the line came from
#   - text_act*: the line itself, split on newlines and trimmed
# row_number() is used instead of 1:n(), which evaluates to c(1, 0) and
# makes mutate() fail when the input has zero rows
act1_lines <- data.frame(ham_act1) %>%
  mutate(
    page = row_number(),
    text_act1 = str_split(ham_act1, pattern = "\\n")
  ) %>%
  unnest(text_act1) %>%
  mutate(text_act1 = str_trim(text_act1))

act2_lines <- data.frame(ham_act2) %>%
  mutate(
    page = row_number(),
    text_act2 = str_split(ham_act2, pattern = "\\n")
  ) %>%
  unnest(text_act2) %>%
  mutate(text_act2 = str_trim(text_act2))

Do some tidying

# tag every line with its song number: lines containing "Song" are taken as
# headers, carried down over the lines that follow, split on a space into a
# label ("so") and a numeral ("no"), and the numeral is converted from a
# Roman numeral to a number
# NOTE(review): the "Song" match is unanchored, so a lyric line containing
# "Song" would also be treated as a header -- confirm against the PDFs
tunes_act1 <- act1_lines %>%
  mutate(
    song = if_else(str_detect(text_act1, "Song"), text_act1, NA_character_)
  ) %>%
  fill(song) %>%
  separate(song, into = c("so", "no"), sep = " ") %>%
  mutate(song = as.numeric(as.roman(no)))

tunes_act2 <- act2_lines %>%
  mutate(
    song = if_else(str_detect(text_act2, "Song"), text_act2, NA_character_)
  ) %>%
  fill(song) %>%
  separate(song, into = c("so", "no"), sep = " ") %>%
  mutate(song = as.numeric(as.roman(no)))

Find word count per act by song

# split each lyric line into one word per row; the full-page text column is
# dropped first because nothing downstream uses it
words_act1 <- tunes_act1 %>%
  select(-ham_act1) %>%
  unnest_tokens(word, text_act1)

words_act2 <- tunes_act2 %>%
  select(-ham_act2) %>%
  unnest_tokens(word, text_act2)

# occurrences of every word within each song (stop words still included)
act1_wordcount <- count(words_act1, song, word)

act2_wordcount <- count(words_act2, song, word)

Remove Stop Words

# head(stop_words)

# drop every word that appears in the stop_words table
words_act1_clean <- anti_join(words_act1, stop_words, by = "word")

words_act2_clean <- anti_join(words_act2, stop_words, by = "word")

# per-song counts of the remaining (non-stop) words
act1_nonstop_counts <- count(words_act1_clean, song, word)

act2_nonstop_counts <- count(words_act2_clean, song, word)

Find top 5 words from each song in each Act

# five most frequent non-stop words of every song: rows are sorted by
# descending count first, so the first five rows of each song group are
# that song's top five
act1_top5_words <- act1_nonstop_counts %>%
  arrange(desc(n)) %>%
  group_by(song) %>%
  slice(seq_len(5)) %>%
  ungroup()

act2_top5_words <- act2_nonstop_counts %>%
  arrange(desc(n)) %>%
  group_by(song) %>%
  slice(seq_len(5)) %>%
  ungroup()

Act 1

Songs 1-5

# top-five words for Act 1 songs 1-5, song numbers replaced by titles
act1_1_5 <- act1_top5_words %>%
  filter(song %in% as.character(1:5)) %>%
  mutate(song = unname(c(
    "1" = "Alexander Hamilton",
    "2" = "Aaron Burr, Sir",
    "3" = "My Shot",
    "4" = "The Story of Tonight",
    "5" = "The Schuyler Sisters"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act1_1_5 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Songs 6-10

# top-five words for Act 1 songs 6-10, song numbers replaced by titles
act1_6_10 <- act1_top5_words %>%
  filter(song %in% as.character(6:10)) %>%
  mutate(song = unname(c(
    "6" = "Farmer Refuted",
    "7" = "You'll Be Back",
    "8" = "Right Hand Man",
    "9" = "A Winter's Ball",
    "10" = "Helpless"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act1_6_10 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Songs 11-15

# top-five words for Act 1 songs 11-15, song numbers replaced by titles
# (long titles carry a line break so they fit the facet strip)
act1_11_15 <- act1_top5_words %>%
  filter(song %in% as.character(11:15)) %>%
  mutate(song = unname(c(
    "11" = "Satisfied",
    "12" = "The Story of \nTonight (Reprise)",
    "13" = "Wait for It",
    "14" = "Stay Alive",
    "15" = "Ten Duel \nCommandments"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act1_11_15 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Songs 16-20

# top-five words for Act 1 songs 16-20, song numbers replaced by titles
act1_16_20 <- act1_top5_words %>%
  filter(song %in% as.character(16:20)) %>%
  mutate(song = unname(c(
    "16" = "Meet Me Inside",
    "17" = "That Would Be Enough",
    "18" = "Guns and Ships",
    "19" = "History Has Its \nEyes on You",
    "20" = "Yorktown (The World \nTurned Upside Down)"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act1_16_20 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Songs 21-23

# top-five words for Act 1 songs 21-23, song numbers replaced by titles
act1_21_23 <- act1_top5_words %>%
  filter(song %in% as.character(21:23)) %>%
  mutate(song = unname(c(
    "21" = "What Comes Next?",
    "22" = "Dear Theodosia",
    "23" = "Non-Stop"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act1_21_23 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Act 2

Songs 1-5

# top-five words for Act 2 songs 1-5, song numbers replaced by titles
act2_1_5 <- act2_top5_words %>%
  filter(song %in% as.character(1:5)) %>%
  mutate(song = unname(c(
    "1" = "What'd I Miss",
    "2" = "Cabinet Battle #1",
    "3" = "Take a Break",
    "4" = "Say No to This",
    "5" = "The Room Where \nIt Happens"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act2_1_5 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Songs 6-10

# top-five words for Act 2 songs 6-10, song numbers replaced by titles
act2_6_10 <- act2_top5_words %>%
  filter(song %in% as.character(6:10)) %>%
  mutate(song = unname(c(
    "6" = "Schuyler Defeated",
    "7" = "Cabinet Battle #2",
    "8" = "Washington on Your Side",
    "9" = "One Last Time",
    "10" = "I Know Him"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act2_6_10 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Songs 11-15

# top-five words for Act 2 songs 11-15, song numbers replaced by titles
act2_11_15 <- act2_top5_words %>%
  filter(song %in% as.character(11:15)) %>%
  mutate(song = unname(c(
    "11" = "The Adams Administration",
    "12" = "We Know",
    "13" = "Hurricane",
    "14" = "The Reynolds Pamphlet",
    "15" = "Burn"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act2_11_15 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Songs 16-20

# top-five words for Act 2 songs 16-20, song numbers replaced by titles
act2_16_20 <- act2_top5_words %>%
  filter(song %in% as.character(16:20)) %>%
  mutate(song = unname(c(
    "16" = "Blow Us All Away",
    "17" = "Stay Alive - Reprise",
    "18" = "It's Quiet Uptown",
    "19" = "The Election of 1800",
    "20" = "Your Obedient Servant"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act2_16_20 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Songs 21-23

# top-five words for Act 2 songs 21-23, song numbers replaced by titles
act2_21_23 <- act2_top5_words %>%
  filter(song %in% as.character(21:23)) %>%
  mutate(song = unname(c(
    "21" = "Best of Wives and \nBest of Women",
    "22" = "The World Was Wide Enough",
    "23" = "Who Lives, Who Dies, \n Who Tells Your Story"
  )[as.character(song)]))

# one panel per song, each with its own axis ranges
act2_21_23 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(vars(song), scales = "free")

Word clouds of top 100 words in each act

# top 100 words of each act for the word clouds
# the per-song counts are summed across songs first, so every word appears
# once with its act-wide total; slicing the per-song table directly ranked
# (song, word) pairs and could put the same word in the cloud several times
act1_top100 <- act1_nonstop_counts %>%
  count(word, wt = n, name = "n", sort = TRUE) %>%
  slice(seq_len(100))

act2_top100 <- act2_nonstop_counts %>%
  count(word, wt = n, name = "n", sort = TRUE) %>%
  slice(seq_len(100))

Act 1

# Act 1 word cloud: label size and colour both follow the word count
act1_cloud <- act1_top100 %>%
  ggplot(aes(label = word)) +
  geom_text_wordcloud(
    aes(size = n, color = n),
    eccentricity = 0.4,
    shape = "pentagon"
  ) +
  scale_color_gradientn(colors = c("darkgreen", "blue", "purple")) +
  scale_size_area(max_size = 9) +
  theme_minimal()

act1_cloud

Act 2

# Act 2 word cloud: label size and colour both follow the word count
act2_cloud <- act2_top100 %>%
  ggplot(aes(label = word)) +
  geom_text_wordcloud(
    aes(size = n, color = n),
    shape = "pentagon"
  ) +
  scale_size_area(max_size = 6) +
  scale_color_gradientn(colors = c("darkgreen", "blue", "purple")) +
  theme_minimal()

act2_cloud

Sentiment Analysis

“AFINN” Lexicon

Act 1

# score the Act 1 non-stop words with the AFINN lexicon; words without a
# match in the lexicon are dropped by the inner join
act1_afinn <- inner_join(
  words_act1_clean,
  get_sentiments("afinn"),
  by = "word"
)

# number of scored words per song at each AFINN value
act1_afinn_counts <- count(act1_afinn, song, value)
Songs 1-5
# AFINN value counts for Act 1 songs 1-5, labelled by song title
act1_afinn_1_5 <- act1_afinn %>%
  filter(song %in% as.character(1:5)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "1" = "Alexander Hamilton",
    "2" = "Aaron Burr, Sir",
    "3" = "My Shot",
    "4" = "The Story of Tonight",
    "5" = "The Schuyler Sisters"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act1_afinn_1_5 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Songs 6-10
# AFINN value counts for Act 1 songs 6-10, labelled by song title
act1_afinn_6_10 <- act1_afinn %>%
  filter(song %in% as.character(6:10)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "6" = "Farmer Refuted",
    "7" = "You'll Be Back",
    "8" = "Right Hand Man",
    "9" = "A Winter's Ball",
    "10" = "Helpless"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act1_afinn_6_10 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Songs 11-15
# AFINN value counts for Act 1 songs 11-15, labelled by song title
act1_afinn_11_15 <- act1_afinn %>%
  filter(song %in% as.character(11:15)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "11" = "Satisfied",
    "12" = "The Story of \nTonight (Reprise)",
    "13" = "Wait for It",
    "14" = "Stay Alive",
    "15" = "Ten Duel \nCommandments"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act1_afinn_11_15 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Songs 16-20
# AFINN value counts for Act 1 songs 16-20, labelled by song title
act1_afinn_16_20 <- act1_afinn %>%
  filter(song %in% as.character(16:20)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "16" = "Meet Me Inside",
    "17" = "That Would Be Enough",
    "18" = "Guns and Ships",
    "19" = "History Has Its \nEyes on You",
    "20" = "Yorktown (The World \nTurned Upside Down)"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act1_afinn_16_20 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Songs 21-23
# AFINN value counts for Act 1 songs 21-23, labelled by song title
act1_afinn_21_23 <- act1_afinn %>%
  filter(song %in% as.character(21:23)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "21" = "What Comes Next?",
    "22" = "Dear Theodosia",
    "23" = "Non-Stop"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act1_afinn_21_23 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Mean AFINN score per song (Act 1)
# average AFINN value of the scored words in each Act 1 song
act1_afinn_means <- summarize(
  group_by(act1_afinn, song),
  mean_afinn = mean(value)
)

# horizontal bars; the factor levels are reversed so that, after the flip,
# song 1 is drawn at the top
act1_afinn_means %>%
  ggplot(aes(x = fct_rev(factor(song)), y = mean_afinn)) +
  geom_col() +
  coord_flip()

Act 2

# score the Act 2 non-stop words with the AFINN lexicon; words without a
# match in the lexicon are dropped by the inner join
act2_afinn <- inner_join(
  words_act2_clean,
  get_sentiments("afinn"),
  by = "word"
)

# number of scored words per song at each AFINN value
act2_afinn_counts <- count(act2_afinn, song, value)
Songs 1-5
# AFINN value counts for Act 2 songs 1-5, labelled by song title
act2_afinn_1_5 <- act2_afinn %>%
  filter(song %in% as.character(1:5)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "1" = "What'd I Miss",
    "2" = "Cabinet Battle #1",
    "3" = "Take a Break",
    "4" = "Say No to This",
    "5" = "The Room Where \nIt Happens"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act2_afinn_1_5 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Songs 6-10
# AFINN value counts for Act 2 songs 6-10, labelled by song title
act2_afinn_6_10 <- act2_afinn %>%
  filter(song %in% as.character(6:10)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "6" = "Schuyler Defeated",
    "7" = "Cabinet Battle #2",
    "8" = "Washington on Your Side",
    "9" = "One Last Time",
    "10" = "I Know Him"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act2_afinn_6_10 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Songs 11-15
# AFINN value counts for Act 2 songs 11-15, labelled by song title
act2_afinn_11_15 <- act2_afinn %>%
  filter(song %in% as.character(11:15)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "11" = "The Adams Administration",
    "12" = "We Know",
    "13" = "Hurricane",
    "14" = "The Reynolds Pamphlet",
    "15" = "Burn"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act2_afinn_11_15 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Songs 16-20
# AFINN value counts for Act 2 songs 16-20, labelled by song title
act2_afinn_16_20 <- act2_afinn %>%
  filter(song %in% as.character(16:20)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "16" = "Blow Us All Away",
    "17" = "Stay Alive - Reprise",
    "18" = "It's Quiet Uptown",
    "19" = "The Election of 1800",
    "20" = "Your Obedient Servant"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act2_afinn_16_20 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Songs 21-23
# AFINN value counts for Act 2 songs 21-23, labelled by song title
act2_afinn_21_23 <- act2_afinn %>%
  filter(song %in% as.character(21:23)) %>%
  count(song, value) %>%
  mutate(song = unname(c(
    "21" = "Best of Wives and \nBest of Women",
    "22" = "The World Was Wide Enough",
    "23" = "Who Lives, Who Dies, \n Who Tells Your Story"
  )[as.character(song)]))

# words per AFINN value, one panel per song
act2_afinn_21_23 %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(vars(song))

Mean AFINN score per song (Act 2)
# average AFINN value of the scored words in each Act 2 song
act2_afinn_means <- summarize(
  group_by(act2_afinn, song),
  mean_afinn = mean(value)
)

# horizontal bars; the factor levels are reversed so that, after the flip,
# song 1 is drawn at the top
act2_afinn_means %>%
  ggplot(aes(x = fct_rev(factor(song)), y = mean_afinn)) +
  geom_col() +
  coord_flip()

“NRC” Lexicon

Act 1

# attach NRC sentiment categories to the Act 1 non-stop words
# the join key is spelled out (as in the AFINN join) instead of being
# inferred from whichever column names the two tables happen to share
# NOTE(review): a word may match several NRC rows, so one lyric word can
# produce more than one output row -- confirm this is intended for the counts
act1_nrc <- words_act1_clean %>%
  inner_join(get_sentiments("nrc"), by = "word")

# number of matched words per song and sentiment category
act1_nrc_counts <- act1_nrc %>%
  count(song, sentiment)

# overview across all songs: horizontal bars, one panel per song number
ggplot(data = act1_nrc_counts,
       aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 1-5
# NRC sentiment counts for Act 1 songs 1-5, labelled by song title
act1_nrc_1_5 <- act1_nrc %>%
  filter(song %in% as.character(1:5)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "1" = "Alexander Hamilton",
    "2" = "Aaron Burr, Sir",
    "3" = "My Shot",
    "4" = "The Story of Tonight",
    "5" = "The Schuyler Sisters"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act1_nrc_1_5 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Songs 6-10
# NRC sentiment counts for Act 1 songs 6-10, labelled by song title
act1_nrc_6_10 <- act1_nrc %>%
  filter(song %in% as.character(6:10)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "6" = "Farmer Refuted",
    "7" = "You'll Be Back",
    "8" = "Right Hand Man",
    "9" = "A Winter's Ball",
    "10" = "Helpless"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act1_nrc_6_10 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Songs 11-15
# NRC sentiment counts for Act 1 songs 11-15, labelled by song title
act1_nrc_11_15 <- act1_nrc %>%
  filter(song %in% as.character(11:15)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "11" = "Satisfied",
    "12" = "The Story of \nTonight (Reprise)",
    "13" = "Wait for It",
    "14" = "Stay Alive",
    "15" = "Ten Duel \nCommandments"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act1_nrc_11_15 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Songs 16-20
# NRC sentiment counts for Act 1 songs 16-20, labelled by song title
act1_nrc_16_20 <- act1_nrc %>%
  filter(song %in% as.character(16:20)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "16" = "Meet Me Inside",
    "17" = "That Would Be Enough",
    "18" = "Guns and Ships",
    "19" = "History Has Its \nEyes on You",
    "20" = "Yorktown (The World \nTurned Upside Down)"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act1_nrc_16_20 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Songs 21-23
# NRC sentiment counts for Act 1 songs 21-23, labelled by song title
act1_nrc_21_23 <- act1_nrc %>%
  filter(song %in% as.character(21:23)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "21" = "What Comes Next?",
    "22" = "Dear Theodosia",
    "23" = "Non-Stop"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act1_nrc_21_23 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Act 2

# attach NRC sentiment categories to the Act 2 non-stop words
# the join key is spelled out (as in the AFINN join) instead of being
# inferred from whichever column names the two tables happen to share
# NOTE(review): a word may match several NRC rows, so one lyric word can
# produce more than one output row -- confirm this is intended for the counts
act2_nrc <- words_act2_clean %>%
  inner_join(get_sentiments("nrc"), by = "word")

# number of matched words per song and sentiment category
act2_nrc_counts <- act2_nrc %>%
  count(song, sentiment)

# overview across all songs: horizontal bars, one panel per song number
ggplot(data = act2_nrc_counts,
       aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 1-5
# NRC sentiment counts for Act 2 songs 1-5, labelled by song title
act2_nrc_1_5 <- act2_nrc %>%
  filter(song %in% as.character(1:5)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "1" = "What'd I Miss",
    "2" = "Cabinet Battle #1",
    "3" = "Take a Break",
    "4" = "Say No to This",
    "5" = "The Room Where \nIt Happens"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act2_nrc_1_5 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Songs 6-10
# NRC sentiment counts for Act 2 songs 6-10, labelled by song title
act2_nrc_6_10 <- act2_nrc %>%
  filter(song %in% as.character(6:10)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "6" = "Schuyler Defeated",
    "7" = "Cabinet Battle #2",
    "8" = "Washington on Your Side",
    "9" = "One Last Time",
    "10" = "I Know Him"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act2_nrc_6_10 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Songs 11-15
# NRC sentiment counts for Act 2 songs 11-15, labelled by song title
act2_nrc_11_15 <- act2_nrc %>%
  filter(song %in% as.character(11:15)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "11" = "The Adams Administration",
    "12" = "We Know",
    "13" = "Hurricane",
    "14" = "The Reynolds Pamphlet",
    "15" = "Burn"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act2_nrc_11_15 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Songs 16-20
# NRC sentiment counts for Act 2 songs 16-20, labelled by song title
act2_nrc_16_20 <- act2_nrc %>%
  filter(song %in% as.character(16:20)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "16" = "Blow Us All Away",
    "17" = "Stay Alive - Reprise",
    "18" = "It's Quiet Uptown",
    "19" = "The Election of 1800",
    "20" = "Your Obedient Servant"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act2_nrc_16_20 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()

Songs 21-23
# NRC sentiment counts for Act 2 songs 21-23, labelled by song title
act2_nrc_21_23 <- act2_nrc %>%
  filter(song %in% as.character(21:23)) %>%
  count(song, sentiment) %>%
  mutate(song = unname(c(
    "21" = "Best of Wives and \nBest of Women",
    "22" = "The World Was Wide Enough",
    "23" = "Who Lives, Who Dies, \n Who Tells Your Story"
  )[as.character(song)]))

# horizontal bars per sentiment, one panel per song
act2_nrc_21_23 %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(vars(song)) +
  coord_flip()